// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/Translate.java,v $
// $Author: derrickoswald $
// $Date: 2006/06/04 19:17:21 $
// $Revision: 1.47 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.util;

import org.htmlparser.util.sort.Sort;

import java.io.*;

/**
 * A character entity reference whose kernel is a window onto a larger string.
 * The window is delimited by a start index (inclusive) and an end index
 * (exclusive) into the inherited kernel string, so that a candidate kernel
 * embedded in other text can be compared against other references for
 * lookup purposes.
 */
class CharacterReferenceEx extends CharacterReference {
    /**
     * Index into the kernel string where the window begins (inclusive).
     */
    protected int mStart;

    /**
     * Index into the kernel string where the window ends (exclusive).
     */
    protected int mEnd;

    /**
     * Create a reference with an empty kernel and a zero character. The
     * kernel, start and end are expected to be set before the object is used.
     */
    public CharacterReferenceEx() {
        super("", 0);
    }

    /**
     * Set the index at which the kernel window begins.
     *
     * @param start The inclusive starting index within the kernel string.
     */
    public void setStart(int start) {
        this.mStart = start;
    }

    /**
     * Set the index at which the kernel window stops. The comparison may
     * finish before reaching it, so this is only an upper bound on the
     * length of the kernel.
     *
     * @param end The exclusive ending index within the kernel string.
     */
    public void setEnd(int end) {
        this.mEnd = end;
    }

    /**
     * Get the portion of the kernel string selected by the window.
     *
     * @return The characters from the start index up to, but not including,
     *         the end index.
     */
    public String getKernel() {
        final String window = mKernel.substring(mStart, mEnd);
        return (window);
    }

    //
    // Ordered interface
    //

    /**
     * Compare the windowed kernel against the kernel of another reference,
     * character by character.
     *
     * @param that The reference to compare with; it is cast to
     *             <code>CharacterReference</code>.
     * @return The difference between the first pair of differing characters,
     *         <code>1</code> if the other kernel runs out before the window
     *         does, or <code>0</code> if the window is exhausted without
     *         finding a difference (even when the other kernel is longer).
     * @see org.htmlparser.util.sort.Ordered
     */
    public int compare(Object that) {
        final String other = ((CharacterReference) that).getKernel();
        final int otherLength = other.length();
        int position = mStart;
        int offset = 0;

        while (position < mEnd) {
            // The other kernel is a proper prefix of the window: this one sorts after it.
            if (offset >= otherLength) {
                return (1);
            }
            final int difference = mKernel.charAt(position) - other.charAt(offset);
            if (difference != 0) {
                return (difference);
            }
            position++;
            offset++;
        }

        // Window consumed with every character matching.
        return (0);
    }
}

/**
 * Translate numeric character references and character entity references to
 * Unicode characters. Based on tables found at <a
 * href="http://www.w3.org/TR/REC-html40/sgml/entities.html">
 * http://www.w3.org/TR/REC-html40/sgml/entities.html</a>
 * <p/>
 * Typical usage:
 * <p/>
 * <pre>
 * String s = Translate.decode(getTextFromHtmlPage());
 * </pre>
 * <p/>
 * or
 * <p/>
 * <pre>
 * String s = &quot;&lt;HTML&gt;&quot; + Translate.encode(getArbitraryText()) + &quot;&lt;/HTML&gt;&quot;;
 * </pre>
 */
public class Translate {
    /**
     * If this member is set <code>true</code>, decoding of streams is done line
     * by line in order to reduce the maximum memory required.
     */
    static public boolean DECODE_LINE_BY_LINE = false;

    /**
     * If this member is set <code>true</code>, encoding of numeric character
     * references uses hexadecimal digits, e.g. &amp;#x25CB;, instead of decimal
     * digits.
     */
    static public boolean ENCODE_HEXADECIMAL = false;

    /**
     * Table mapping entity reference kernel to character. This is sorted by
     * kernel when the class is loaded.
     */
    protected static final CharacterReference[] mCharacterReferences = {
            // Portions © International Organization for Standardization 1986
            // Permission to copy in any form is granted for use with
            // conforming SGML systems and applications as defined in
            // ISO 8879, provided this notice is included in all copies.
            // Character entity set. Typical invocation:
            // <!ENTITY % HTMLlat1 PUBLIC
            // "-//W3C//ENTITIES Latin 1//EN//HTML">
            // %HTMLlat1;
            new CharacterReference("nbsp", '\u00a0'), // no-break space =
            // non-breaking space,
            // U+00A0 ISOnum
            new CharacterReference("iexcl", '\u00a1'), // inverted exclamation
            // mark, U+00A1 ISOnum
            new CharacterReference("cent", '\u00a2'), // cent sign, U+00A2
            // ISOnum
            new CharacterReference("pound", '\u00a3'), // pound sign, U+00A3
            // ISOnum
            new CharacterReference("curren", '\u00a4'), // currency sign, U+00A4
            // ISOnum
            new CharacterReference("yen", '\u00a5'), // yen sign = yuan sign,
            // U+00A5 ISOnum
            new CharacterReference("brvbar", '\u00a6'), // broken bar = broken
            // vertical bar, U+00A6
            // ISOnum
            new CharacterReference("sect", '\u00a7'), // section sign, U+00A7
            // ISOnum
            new CharacterReference("uml", '\u00a8'), // diaeresis = spacing
            // diaeresis, U+00A8
            // ISOdia
            new CharacterReference("copy", '\u00a9'), // copyright sign, U+00A9
            // ISOnum
            new CharacterReference("ordf", '\u00aa'), // feminine ordinal
            // indicator, U+00AA
            // ISOnum
            new CharacterReference("laquo", '\u00ab'), // left-pointing double
            // angle quotation mark
            // = left pointing
            // guillemet, U+00AB
            // ISOnum
            new CharacterReference("not", '\u00ac'), // not sign, U+00AC ISOnum
            new CharacterReference("shy", '\u00ad'), // soft hyphen =
            // discretionary hyphen,
            // U+00AD ISOnum
            new CharacterReference("reg", '\u00ae'), // registered sign =
            // registered trade mark
            // sign, U+00AE ISOnum
            new CharacterReference("macr", '\u00af'), // macron = spacing macron
            // = overline = APL
            // overbar, U+00AF
            // ISOdia
            new CharacterReference("deg", '\u00b0'), // degree sign, U+00B0
            // ISOnum
            new CharacterReference("plusmn", '\u00b1'), // plus-minus sign =
            // plus-or-minus sign,
            // U+00B1 ISOnum
            new CharacterReference("sup2", '\u00b2'), // superscript two =
            // superscript digit two
            // = squared, U+00B2
            // ISOnum
            new CharacterReference("sup3", '\u00b3'), // superscript three =
            // superscript digit
            // three = cubed, U+00B3
            // ISOnum
            new CharacterReference("acute", '\u00b4'), // acute accent = spacing
            // acute, U+00B4 ISOdia
            new CharacterReference("micro", '\u00b5'), // micro sign, U+00B5
            // ISOnum
            new CharacterReference("para", '\u00b6'), // pilcrow sign =
            // paragraph sign,
            // U+00B6 ISOnum
            new CharacterReference("middot", '\u00b7'), // middle dot = Georgian
            // comma = Greek middle
            // dot, U+00B7 ISOnum
            new CharacterReference("cedil", '\u00b8'), // cedilla = spacing
            // cedilla, U+00B8
            // ISOdia
            new CharacterReference("sup1", '\u00b9'), // superscript one =
            // superscript digit
            // one, U+00B9 ISOnum
            new CharacterReference("ordm", '\u00ba'), // masculine ordinal
            // indicator, U+00BA
            // ISOnum
            new CharacterReference("raquo", '\u00bb'), // right-pointing double
            // angle quotation mark
            // = right pointing
            // guillemet, U+00BB
            // ISOnum
            new CharacterReference("frac14", '\u00bc'), // vulgar fraction one
            // quarter = fraction
            // one quarter, U+00BC
            // ISOnum
            new CharacterReference("frac12", '\u00bd'), // vulgar fraction one
            // half = fraction one
            // half, U+00BD ISOnum
            new CharacterReference("frac34", '\u00be'), // vulgar fraction three
            // quarters = fraction
            // three quarters,
            // U+00BE ISOnum
            new CharacterReference("iquest", '\u00bf'), // inverted question
            // mark = turned
            // question mark, U+00BF
            // ISOnum
            new CharacterReference("Agrave", '\u00c0'), // latin capital letter
            // A with grave = latin
            // capital letter A
            // grave, U+00C0 ISOlat1
            new CharacterReference("Aacute", '\u00c1'), // latin capital letter
            // A with acute, U+00C1
            // ISOlat1
            new CharacterReference("Acirc", '\u00c2'), // latin capital letter A
            // with circumflex,
            // U+00C2 ISOlat1
            new CharacterReference("Atilde", '\u00c3'), // latin capital letter
            // A with tilde, U+00C3
            // ISOlat1
            new CharacterReference("Auml", '\u00c4'), // latin capital letter A
            // with diaeresis,
            // U+00C4 ISOlat1
            new CharacterReference("Aring", '\u00c5'), // latin capital letter A
            // with ring above =
            // latin capital letter
            // A ring, U+00C5
            // ISOlat1
            new CharacterReference("AElig", '\u00c6'), // latin capital letter
            // AE = latin capital
            // ligature AE, U+00C6
            // ISOlat1
            new CharacterReference("Ccedil", '\u00c7'), // latin capital letter
            // C with cedilla,
            // U+00C7 ISOlat1
            new CharacterReference("Egrave", '\u00c8'), // latin capital letter
            // E with grave, U+00C8
            // ISOlat1
            new CharacterReference("Eacute", '\u00c9'), // latin capital letter
            // E with acute, U+00C9
            // ISOlat1
            new CharacterReference("Ecirc", '\u00ca'), // latin capital letter E
            // with circumflex,
            // U+00CA ISOlat1
            new CharacterReference("Euml", '\u00cb'), // latin capital letter E
            // with diaeresis,
            // U+00CB ISOlat1
            new CharacterReference("Igrave", '\u00cc'), // latin capital letter
            // I with grave, U+00CC
            // ISOlat1
            new CharacterReference("Iacute", '\u00cd'), // latin capital letter
            // I with acute, U+00CD
            // ISOlat1
            new CharacterReference("Icirc", '\u00ce'), // latin capital letter I
            // with circumflex,
            // U+00CE ISOlat1
            new CharacterReference("Iuml", '\u00cf'), // latin capital letter I
            // with diaeresis,
            // U+00CF ISOlat1
            new CharacterReference("ETH", '\u00d0'), // latin capital letter
            // ETH, U+00D0 ISOlat1
            new CharacterReference("Ntilde", '\u00d1'), // latin capital letter
            // N with tilde, U+00D1
            // ISOlat1
            new CharacterReference("Ograve", '\u00d2'), // latin capital letter
            // O with grave, U+00D2
            // ISOlat1
            new CharacterReference("Oacute", '\u00d3'), // latin capital letter
            // O with acute, U+00D3
            // ISOlat1
            new CharacterReference("Ocirc", '\u00d4'), // latin capital letter O
            // with circumflex,
            // U+00D4 ISOlat1
            new CharacterReference("Otilde", '\u00d5'), // latin capital letter
            // O with tilde, U+00D5
            // ISOlat1
            new CharacterReference("Ouml", '\u00d6'), // latin capital letter O
            // with diaeresis,
            // U+00D6 ISOlat1
            new CharacterReference("times", '\u00d7'), // multiplication sign,
            // U+00D7 ISOnum
            new CharacterReference("Oslash", '\u00d8'), // latin capital letter
            // O with stroke = latin
            // capital letter O
            // slash, U+00D8 ISOlat1
            new CharacterReference("Ugrave", '\u00d9'), // latin capital letter
            // U with grave, U+00D9
            // ISOlat1
            new CharacterReference("Uacute", '\u00da'), // latin capital letter
            // U with acute, U+00DA
            // ISOlat1
            new CharacterReference("Ucirc", '\u00db'), // latin capital letter U
            // with circumflex,
            // U+00DB ISOlat1
            new CharacterReference("Uuml", '\u00dc'), // latin capital letter U
            // with diaeresis,
            // U+00DC ISOlat1
            new CharacterReference("Yacute", '\u00dd'), // latin capital letter
            // Y with acute, U+00DD
            // ISOlat1
            new CharacterReference("THORN", '\u00de'), // latin capital letter
            // THORN, U+00DE ISOlat1
            new CharacterReference("szlig", '\u00df'), // latin small letter
            // sharp s = ess-zed,
            // U+00DF ISOlat1
            new CharacterReference("agrave", '\u00e0'), // latin small letter a
            // with grave = latin
            // small letter a grave,
            // U+00E0 ISOlat1
            new CharacterReference("aacute", '\u00e1'), // latin small letter a
            // with acute, U+00E1
            // ISOlat1
            new CharacterReference("acirc", '\u00e2'), // latin small letter a
            // with circumflex,
            // U+00E2 ISOlat1
            new CharacterReference("atilde", '\u00e3'), // latin small letter a
            // with tilde, U+00E3
            // ISOlat1
            new CharacterReference("auml", '\u00e4'), // latin small letter a
            // with diaeresis,
            // U+00E4 ISOlat1
            new CharacterReference("aring", '\u00e5'), // latin small letter a
            // with ring above =
            // latin small letter a
            // ring, U+00E5 ISOlat1
            new CharacterReference("aelig", '\u00e6'), // latin small letter ae
            // = latin small
            // ligature ae, U+00E6
            // ISOlat1
            new CharacterReference("ccedil", '\u00e7'), // latin small letter c
            // with cedilla, U+00E7
            // ISOlat1
            new CharacterReference("egrave", '\u00e8'), // latin small letter e
            // with grave, U+00E8
            // ISOlat1
            new CharacterReference("eacute", '\u00e9'), // latin small letter e
            // with acute, U+00E9
            // ISOlat1
            new CharacterReference("ecirc", '\u00ea'), // latin small letter e
            // with circumflex,
            // U+00EA ISOlat1
            new CharacterReference("euml", '\u00eb'), // latin small letter e
            // with diaeresis,
            // U+00EB ISOlat1
            new CharacterReference("igrave", '\u00ec'), // latin small letter i
            // with grave, U+00EC
            // ISOlat1
            new CharacterReference("iacute", '\u00ed'), // latin small letter i
            // with acute, U+00ED
            // ISOlat1
            new CharacterReference("icirc", '\u00ee'), // latin small letter i
            // with circumflex,
            // U+00EE ISOlat1
            new CharacterReference("iuml", '\u00ef'), // latin small letter i
            // with diaeresis,
            // U+00EF ISOlat1
            new CharacterReference("eth", '\u00f0'), // latin small letter eth,
            // U+00F0 ISOlat1
            new CharacterReference("ntilde", '\u00f1'), // latin small letter n
            // with tilde, U+00F1
            // ISOlat1
            new CharacterReference("ograve", '\u00f2'), // latin small letter o
            // with grave, U+00F2
            // ISOlat1
            new CharacterReference("oacute", '\u00f3'), // latin small letter o
            // with acute, U+00F3
            // ISOlat1
            new CharacterReference("ocirc", '\u00f4'), // latin small letter o
            // with circumflex,
            // U+00F4 ISOlat1
            new CharacterReference("otilde", '\u00f5'), // latin small letter o
            // with tilde, U+00F5
            // ISOlat1
            new CharacterReference("ouml", '\u00f6'), // latin small letter o
            // with diaeresis,
            // U+00F6 ISOlat1
            new CharacterReference("divide", '\u00f7'), // division sign, U+00F7
            // ISOnum
            new CharacterReference("oslash", '\u00f8'), // latin small letter o
            // with stroke, = latin
            // small letter o slash,
            // U+00F8 ISOlat1
            new CharacterReference("ugrave", '\u00f9'), // latin small letter u
            // with grave, U+00F9
            // ISOlat1
            new CharacterReference("uacute", '\u00fa'), // latin small letter u
            // with acute, U+00FA
            // ISOlat1
            new CharacterReference("ucirc", '\u00fb'), // latin small letter u
            // with circumflex,
            // U+00FB ISOlat1
            new CharacterReference("uuml", '\u00fc'), // latin small letter u
            // with diaeresis,
            // U+00FC ISOlat1
            new CharacterReference("yacute", '\u00fd'), // latin small letter y
            // with acute, U+00FD
            // ISOlat1
            new CharacterReference("thorn", '\u00fe'), // latin small letter
            // thorn, U+00FE ISOlat1
            new CharacterReference("yuml", '\u00ff'), // latin small letter y
            // with diaeresis,
            // U+00FF ISOlat1
            // Mathematical, Greek and Symbolic characters for HTML
            // Character entity set. Typical invocation:
            // <!ENTITY % HTMLsymbol PUBLIC
            // "-//W3C//ENTITIES Symbols//EN//HTML">
            // %HTMLsymbol;
            // Portions © International Organization for Standardization 1986:
            // Permission to copy in any form is granted for use with
            // conforming SGML systems and applications as defined in
            // ISO 8879, provided this notice is included in all copies.
            // Relevant ISO entity set is given unless names are newly
            // introduced.
            // New names (i.e., not in ISO 8879 list) do not clash with any
            // existing ISO 8879 entity names. ISO 10646 character numbers
            // are given for each character, in hex. CDATA values are decimal
            // conversions of the ISO 10646 values and refer to the document
            // character set. Names are ISO 10646 names.
            // Latin Extended-B
            new CharacterReference("fnof", '\u0192'), // latin small f with hook
            // = function = florin,
            // U+0192 ISOtech
            // Greek
            new CharacterReference("Alpha", '\u0391'), // greek capital letter
            // alpha, U+0391
            new CharacterReference("Beta", '\u0392'), // greek capital letter
            // beta, U+0392
            new CharacterReference("Gamma", '\u0393'), // greek capital letter
            // gamma, U+0393 ISOgrk3
            new CharacterReference("Delta", '\u0394'), // greek capital letter
            // delta, U+0394 ISOgrk3
            new CharacterReference("Epsilon", '\u0395'), // greek capital letter
            // epsilon, U+0395
            new CharacterReference("Zeta", '\u0396'), // greek capital letter
            // zeta, U+0396
            new CharacterReference("Eta", '\u0397'), // greek capital letter
            // eta, U+0397
            new CharacterReference("Theta", '\u0398'), // greek capital letter
            // theta, U+0398 ISOgrk3
            new CharacterReference("Iota", '\u0399'), // greek capital letter
            // iota, U+0399
            new CharacterReference("Kappa", '\u039a'), // greek capital letter
            // kappa, U+039A
            new CharacterReference("Lambda", '\u039b'), // greek capital letter
            // lambda, U+039B
            // ISOgrk3
            new CharacterReference("Mu", '\u039c'), // greek capital letter mu,
            // U+039C
            new CharacterReference("Nu", '\u039d'), // greek capital letter nu,
            // U+039D
            new CharacterReference("Xi", '\u039e'), // greek capital letter xi,
            // U+039E ISOgrk3
            new CharacterReference("Omicron", '\u039f'), // greek capital letter
            // omicron, U+039F
            new CharacterReference("Pi", '\u03a0'), // greek capital letter pi,
            // U+03A0 ISOgrk3
            new CharacterReference("Rho", '\u03a1'), // greek capital letter
            // rho, U+03A1
            // there is no Sigmaf, and no U+03A2 character either
            new CharacterReference("Sigma", '\u03a3'), // greek capital letter
            // sigma, U+03A3 ISOgrk3
            new CharacterReference("Tau", '\u03a4'), // greek capital letter
            // tau, U+03A4
            new CharacterReference("Upsilon", '\u03a5'), // greek capital letter
            // upsilon, U+03A5
            // ISOgrk3
            new CharacterReference("Phi", '\u03a6'), // greek capital letter
            // phi, U+03A6 ISOgrk3
            new CharacterReference("Chi", '\u03a7'), // greek capital letter
            // chi, U+03A7
            new CharacterReference("Psi", '\u03a8'), // greek capital letter
            // psi, U+03A8 ISOgrk3
            new CharacterReference("Omega", '\u03a9'), // greek capital letter
            // omega, U+03A9 ISOgrk3
            new CharacterReference("alpha", '\u03b1'), // greek small letter
            // alpha, U+03B1 ISOgrk3
            new CharacterReference("beta", '\u03b2'), // greek small letter
            // beta, U+03B2 ISOgrk3
            new CharacterReference("gamma", '\u03b3'), // greek small letter
            // gamma, U+03B3 ISOgrk3
            new CharacterReference("delta", '\u03b4'), // greek small letter
            // delta, U+03B4 ISOgrk3
            new CharacterReference("epsilon", '\u03b5'), // greek small letter
            // epsilon, U+03B5
            // ISOgrk3
            new CharacterReference("zeta", '\u03b6'), // greek small letter
            // zeta, U+03B6 ISOgrk3
            new CharacterReference("eta", '\u03b7'), // greek small letter eta,
            // U+03B7 ISOgrk3
            new CharacterReference("theta", '\u03b8'), // greek small letter
            // theta, U+03B8 ISOgrk3
            new CharacterReference("iota", '\u03b9'), // greek small letter
            // iota, U+03B9 ISOgrk3
            new CharacterReference("kappa", '\u03ba'), // greek small letter
            // kappa, U+03BA ISOgrk3
            new CharacterReference("lambda", '\u03bb'), // greek small letter
            // lambda, U+03BB
            // ISOgrk3
            new CharacterReference("mu", '\u03bc'), // greek small letter mu,
            // U+03BC ISOgrk3
            new CharacterReference("nu", '\u03bd'), // greek small letter nu,
            // U+03BD ISOgrk3
            new CharacterReference("xi", '\u03be'), // greek small letter xi,
            // U+03BE ISOgrk3
            new CharacterReference("omicron", '\u03bf'), // greek small letter
            // omicron, U+03BF
            // NEW
            new CharacterReference("pi", '\u03c0'), // greek small letter pi,
            // U+03C0 ISOgrk3
            new CharacterReference("rho", '\u03c1'), // greek small letter rho,
            // U+03C1 ISOgrk3
            new CharacterReference("sigmaf", '\u03c2'), // greek small letter
            // final sigma, U+03C2
            // ISOgrk3
            new CharacterReference("sigma", '\u03c3'), // greek small letter
            // sigma, U+03C3 ISOgrk3
            new CharacterReference("tau", '\u03c4'), // greek small letter tau,
            // U+03C4 ISOgrk3
            new CharacterReference("upsilon", '\u03c5'), // greek small letter
            // upsilon, U+03C5
            // ISOgrk3
            new CharacterReference("phi", '\u03c6'), // greek small letter phi,
            // U+03C6 ISOgrk3
            new CharacterReference("chi", '\u03c7'), // greek small letter chi,
            // U+03C7 ISOgrk3
            new CharacterReference("psi", '\u03c8'), // greek small letter psi,
            // U+03C8 ISOgrk3
            new CharacterReference("omega", '\u03c9'), // greek small letter
            // omega, U+03C9 ISOgrk3
            new CharacterReference("thetasym", '\u03d1'), // greek small letter
            // theta symbol,
            // U+03D1 NEW
            new CharacterReference("upsih", '\u03d2'), // greek upsilon with
            // hook symbol, U+03D2
            // NEW
            new CharacterReference("piv", '\u03d6'), // greek pi symbol, U+03D6
            // ISOgrk3
            // General Punctuation
            new CharacterReference("bull", '\u2022'), // bullet = black small
            // circle, U+2022 ISOpub
            // bullet is NOT the same as bullet operator, U+2219
            new CharacterReference("hellip", '\u2026'), // horizontal ellipsis =
            // three dot leader,
            // U+2026 ISOpub
            new CharacterReference("prime", '\u2032'), // prime = minutes =
            // feet, U+2032 ISOtech
            new CharacterReference("Prime", '\u2033'), // double prime = seconds
            // = inches, U+2033
            // ISOtech
            new CharacterReference("oline", '\u203e'), // overline = spacing
            // overscore, U+203E NEW
            new CharacterReference("frasl", '\u2044'), // fraction slash, U+2044
            // NEW
            // Letterlike Symbols
            new CharacterReference("weierp", '\u2118'), // script capital P =
            // power set =
            // Weierstrass p, U+2118
            // ISOamso
            new CharacterReference("image", '\u2111'), // blackletter capital I
            // = imaginary part,
            // U+2111 ISOamso
            new CharacterReference("real", '\u211c'), // blackletter capital R =
            // real part symbol,
            // U+211C ISOamso
            new CharacterReference("trade", '\u2122'), // trade mark sign,
            // U+2122 ISOnum
            new CharacterReference("alefsym", '\u2135'), // alef symbol = first
            // transfinite
            // cardinal, U+2135
            // NEW
            // alef symbol is NOT the same as hebrew letter alef,
            // U+05D0 although the same glyph could be used to depict both
            // characters
            // Arrows
            new CharacterReference("larr", '\u2190'), // leftwards arrow, U+2190
            // ISOnum
            new CharacterReference("uarr", '\u2191'), // upwards arrow, U+2191
            // ISOnum
            new CharacterReference("rarr", '\u2192'), // rightwards arrow,
            // U+2192 ISOnum
            new CharacterReference("darr", '\u2193'), // downwards arrow, U+2193
            // ISOnum
            new CharacterReference("harr", '\u2194'), // left right arrow,
            // U+2194 ISOamsa
            new CharacterReference("crarr", '\u21b5'), // downwards arrow with
            // corner leftwards =
            // carriage return,
            // U+21B5 NEW
            new CharacterReference("lArr", '\u21d0'), // leftwards double arrow,
            // U+21D0 ISOtech
            // ISO 10646 does not say that lArr is the same as the 'is implied
            // by' arrow
            // but also does not have any other character for that function. So
            // ? lArr can
            // be used for 'is implied by' as ISOtech suggests
            new CharacterReference("uArr", '\u21d1'), // upwards double arrow,
            // U+21D1 ISOamsa
            new CharacterReference("rArr", '\u21d2'), // rightwards double
            // arrow, U+21D2 ISOtech
            // ISO 10646 does not say this is the 'implies' character but does
            // not have
            // another character with this function so ?
            // rArr can be used for 'implies' as ISOtech suggests
            new CharacterReference("dArr", '\u21d3'), // downwards double arrow,
            // U+21D3 ISOamsa
            new CharacterReference("hArr", '\u21d4'), // left right double
            // arrow, U+21D4 ISOamsa
            // Mathematical Operators
            new CharacterReference("forall", '\u2200'), // for all, U+2200
            // ISOtech
            new CharacterReference("part", '\u2202'), // partial differential,
            // U+2202 ISOtech
            new CharacterReference("exist", '\u2203'), // there exists, U+2203
            // ISOtech
            new CharacterReference("empty", '\u2205'), // empty set = null set =
            // diameter, U+2205
            // ISOamso
            new CharacterReference("nabla", '\u2207'), // nabla = backward
            // difference, U+2207
            // ISOtech
            new CharacterReference("isin", '\u2208'), // element of, U+2208
            // ISOtech
            new CharacterReference("notin", '\u2209'), // not an element of,
            // U+2209 ISOtech
            new CharacterReference("ni", '\u220b'), // contains as member,
            // U+220B ISOtech
            // should there be a more memorable name than 'ni'?
            new CharacterReference("prod", '\u220f'), // n-ary product = product
            // sign, U+220F ISOamsb
            // prod is NOT the same character as U+03A0 'greek capital letter
            // pi' though
            // the same glyph might be used for both
            new CharacterReference("sum", '\u2211'), // n-ary summation, U+2211
            // ISOamsb
            // sum is NOT the same character as U+03A3 'greek capital letter
            // sigma'
            // though the same glyph might be used for both
            new CharacterReference("minus", '\u2212'), // minus sign, U+2212
            // ISOtech
            new CharacterReference("lowast", '\u2217'), // asterisk operator,
            // U+2217 ISOtech
            new CharacterReference("radic", '\u221a'), // square root = radical
            // sign, U+221A ISOtech
            new CharacterReference("prop", '\u221d'), // proportional to, U+221D
            // ISOtech
            new CharacterReference("infin", '\u221e'), // infinity, U+221E
            // ISOtech
            new CharacterReference("ang", '\u2220'), // angle, U+2220 ISOamso
            new CharacterReference("and", '\u2227'), // logical and = wedge,
            // U+2227 ISOtech
            new CharacterReference("or", '\u2228'), // logical or = vee, U+2228
            // ISOtech
            new CharacterReference("cap", '\u2229'), // intersection = cap,
            // U+2229 ISOtech
            new CharacterReference("cup", '\u222a'), // union = cup, U+222A
            // ISOtech
            new CharacterReference("int", '\u222b'), // integral, U+222B ISOtech
            new CharacterReference("there4", '\u2234'), // therefore, U+2234
            // ISOtech
            new CharacterReference("sim", '\u223c'), // tilde operator = varies
            // with = similar to,
            // U+223C ISOtech
            // tilde operator is NOT the same character as the tilde, U+007E,
            // although the same glyph might be used to represent both
            new CharacterReference("cong", '\u2245'), // approximately equal to,
            // U+2245 ISOtech
            new CharacterReference("asymp", '\u2248'), // almost equal to =
            // asymptotic to, U+2248
            // ISOamsr
            new CharacterReference("ne", '\u2260'), // not equal to, U+2260
            // ISOtech
            new CharacterReference("equiv", '\u2261'), // identical to, U+2261
            // ISOtech
            new CharacterReference("le", '\u2264'), // less-than or equal to,
            // U+2264 ISOtech
            new CharacterReference("ge", '\u2265'), // greater-than or equal to,
            // U+2265 ISOtech
            new CharacterReference("sub", '\u2282'), // subset of, U+2282
            // ISOtech
            new CharacterReference("sup", '\u2283'), // superset of, U+2283
            // ISOtech
            // note that nsup, 'not a superset of, U+2283' is not covered by the
            // Symbol
            // font encoding and is not included. Should it be, for symmetry?
            // It is in ISOamsn
            new CharacterReference("nsub", '\u2284'), // not a subset of, U+2284
            // ISOamsn
            new CharacterReference("sube", '\u2286'), // subset of or equal to,
            // U+2286 ISOtech
            new CharacterReference("supe", '\u2287'), // superset of or equal
            // to, U+2287 ISOtech
            new CharacterReference("oplus", '\u2295'), // circled plus = direct
            // sum, U+2295 ISOamsb
            new CharacterReference("otimes", '\u2297'), // circled times =
            // vector product,
            // U+2297 ISOamsb
            new CharacterReference("perp", '\u22a5'), // up tack = orthogonal to
            // = perpendicular,
            // U+22A5 ISOtech
            new CharacterReference("sdot", '\u22c5'), // dot operator, U+22C5
            // ISOamsb
            // dot operator is NOT the same character as U+00B7 middle dot
            // Miscellaneous Technical
            new CharacterReference("lceil", '\u2308'), // left ceiling = apl
            // upstile, U+2308
            // ISOamsc
            new CharacterReference("rceil", '\u2309'), // right ceiling, U+2309
            // ISOamsc
            new CharacterReference("lfloor", '\u230a'), // left floor = apl
            // downstile, U+230A
            // ISOamsc
            new CharacterReference("rfloor", '\u230b'), // right floor, U+230B
            // ISOamsc
            new CharacterReference("lang", '\u2329'), // left-pointing angle
            // bracket = bra, U+2329
            // ISOtech
            // lang is NOT the same character as U+003C 'less than'
            // or U+2039 'single left-pointing angle quotation mark'
            new CharacterReference("rang", '\u232a'), // right-pointing angle
            // bracket = ket, U+232A
            // ISOtech
            // rang is NOT the same character as U+003E 'greater than'
            // or U+203A 'single right-pointing angle quotation mark'
            // Geometric Shapes
            new CharacterReference("loz", '\u25ca'), // lozenge, U+25CA ISOpub
            // Miscellaneous Symbols
            new CharacterReference("spades", '\u2660'), // black spade suit,
            // U+2660 ISOpub
            // black here seems to mean filled as opposed to hollow
            new CharacterReference("clubs", '\u2663'), // black club suit =
            // shamrock, U+2663
            // ISOpub
            new CharacterReference("hearts", '\u2665'), // black heart suit =
            // valentine, U+2665
            // ISOpub
            new CharacterReference("diams", '\u2666'), // black diamond suit,
            // U+2666 ISOpub
            // Special characters for HTML
            // Character entity set. Typical invocation:
            // <!ENTITY % HTMLspecial PUBLIC
            // "-//W3C//ENTITIES Special//EN//HTML">
            // %HTMLspecial;
            // Portions © International Organization for Standardization 1986:
            // Permission to copy in any form is granted for use with
            // conforming SGML systems and applications as defined in
            // ISO 8879, provided this notice is included in all copies.
            // Relevant ISO entity set is given unless names are newly
            // introduced.
            // New names (i.e., not in ISO 8879 list) do not clash with any
            // existing ISO 8879 entity names. ISO 10646 character numbers
            // are given for each character, in hex. CDATA values are decimal
            // conversions of the ISO 10646 values and refer to the document
            // character set. Names are ISO 10646 names.
            // C0 Controls and Basic Latin
            new CharacterReference("quot", '\u0022'), // quotation mark = APL
            // quote, U+0022 ISOnum
            new CharacterReference("amp", '\u0026'), // ampersand, U+0026 ISOnum
            new CharacterReference("lt", '\u003c'), // less-than sign, U+003C
            // ISOnum
            new CharacterReference("gt", '\u003e'), // greater-than sign, U+003E
            // ISOnum
            // Latin Extended-A
            new CharacterReference("OElig", '\u0152'), // latin capital ligature
            // OE, U+0152 ISOlat2
            new CharacterReference("oelig", '\u0153'), // latin small ligature
            // oe, U+0153 ISOlat2
            // ligature is a misnomer, this is a separate character in some
            // languages
            new CharacterReference("Scaron", '\u0160'), // latin capital letter
            // S with caron, U+0160
            // ISOlat2
            new CharacterReference("scaron", '\u0161'), // latin small letter s
            // with caron, U+0161
            // ISOlat2
            new CharacterReference("Yuml", '\u0178'), // latin capital letter Y
            // with diaeresis,
            // U+0178 ISOlat2
            // Spacing Modifier Letters
            new CharacterReference("circ", '\u02c6'), // modifier letter
            // circumflex accent,
            // U+02C6 ISOpub
            new CharacterReference("tilde", '\u02dc'), // small tilde, U+02DC
            // ISOdia
            // General Punctuation
            new CharacterReference("ensp", '\u2002'), // en space, U+2002 ISOpub
            new CharacterReference("emsp", '\u2003'), // em space, U+2003 ISOpub
            new CharacterReference("thinsp", '\u2009'), // thin space, U+2009
            // ISOpub
            new CharacterReference("zwnj", '\u200c'), // zero width non-joiner,
            // U+200C NEW RFC 2070
            new CharacterReference("zwj", '\u200d'), // zero width joiner,
            // U+200D NEW RFC 2070
            new CharacterReference("lrm", '\u200e'), // left-to-right mark,
            // U+200E NEW RFC 2070
            new CharacterReference("rlm", '\u200f'), // right-to-left mark,
            // U+200F NEW RFC 2070
            new CharacterReference("ndash", '\u2013'), // en dash, U+2013 ISOpub
            new CharacterReference("mdash", '\u2014'), // em dash, U+2014 ISOpub
            new CharacterReference("lsquo", '\u2018'), // left single quotation
            // mark, U+2018 ISOnum
            new CharacterReference("rsquo", '\u2019'), // right single quotation
            // mark, U+2019 ISOnum
            new CharacterReference("sbquo", '\u201a'), // single low-9 quotation
            // mark, U+201A NEW
            new CharacterReference("ldquo", '\u201c'), // left double quotation
            // mark, U+201C ISOnum
            new CharacterReference("rdquo", '\u201d'), // right double quotation
            // mark, U+201D ISOnum
            new CharacterReference("bdquo", '\u201e'), // double low-9 quotation
            // mark, U+201E NEW
            new CharacterReference("dagger", '\u2020'), // dagger, U+2020 ISOpub
            new CharacterReference("Dagger", '\u2021'), // double dagger, U+2021
            // ISOpub
            new CharacterReference("permil", '\u2030'), // per mille sign,
            // U+2030 ISOtech
            new CharacterReference("lsaquo", '\u2039'), // single left-pointing
            // angle quotation mark,
            // U+2039 ISO proposed
            // lsaquo is proposed but not yet ISO standardized
            new CharacterReference("rsaquo", '\u203a'), // single right-pointing
            // angle quotation mark,
            // U+203A ISO proposed
            // rsaquo is proposed but not yet ISO standardized
            new CharacterReference("euro", '\u20ac'), // euro sign, U+20AC NEW
    };

    /**
     * The dividing point between a simple table lookup and a binary search.
     * References whose character is below this value are stored in
     * <code>mCharacterList</code> at the index equal to the character itself
     * (a sparse, directly indexed region); references at or above it are kept
     * in the sorted region that follows and are found by binary search.
     */
    protected static final int BREAKPOINT = 0x100;

    /**
     * List of references sorted by character. The first part of this array,
     * slots <code>0</code> to <code>BREAKPOINT - 1</code>, is a direct
     * translation table: indexing it with a character yields the reference
     * for that character, or <code>null</code> when no reference was stored
     * there. The second part, from <code>BREAKPOINT</code> onwards, is dense
     * and ordered by ascending character, suitable for binary lookup.
     * The array is populated once by the static initializer.
     */
    protected static final CharacterReference[] mCharacterList;

    // Builds mCharacterList (direct table + sorted tail) from the declaration
    // order of mCharacterReferences, then re-sorts mCharacterReferences itself.
    static {
        final int total = mCharacterReferences.length;

        // how many references land in the directly indexed region
        int direct = 0;
        for (int i = 0; i < total; i++) {
            if (mCharacterReferences[i].getCharacter() < BREAKPOINT) {
                direct++;
            }
        }

        // room for the whole direct table plus every remaining reference
        mCharacterList = new CharacterReference[BREAKPOINT + total - direct];

        // 'used' is one past the last occupied slot of the sorted tail
        int used = BREAKPOINT;
        for (int i = 0; i < total; i++) {
            final CharacterReference reference = mCharacterReferences[i];
            final int code = reference.getCharacter();
            if (code < BREAKPOINT) {
                mCharacterList[code] = reference;
                continue;
            }

            // insertion sort, done only once: find the first tail entry with
            // a strictly greater character, open a gap there and drop it in
            int slot = BREAKPOINT;
            while ((slot < used) && (mCharacterList[slot].getCharacter() <= code)) {
                slot++;
            }
            System.arraycopy(mCharacterList, slot, mCharacterList, slot + 1, used - slot);
            mCharacterList[slot] = reference;
            used++;
        }

        // reorder the original array into kernel order
        Sort.QuickSort(mCharacterReferences);
    }

    /**
     * Private constructor, so the class cannot be instantiated from outside.
     * The members shown in this part of the class are all static.
     * NOTE(review): the original comment also described the class as thread
     * safe; that depends on state declared elsewhere in the file - confirm.
     */
    private Translate() {
    }

    /**
     * Binary search for a reference by character.
     * Each step probes the lower middle element of the remaining
     * <code>lo..hi</code> window and narrows the window accordingly.
     *
     * @param array The array of <code>CharacterReference</code> objects,
     *              ordered by ascending character within <code>lo..hi</code>.
     * @param ref   The character to search for.
     * @param lo    The lower index within which to look.
     * @param hi    The upper index within which to look.
     * @return The index at which the reference was found, or, when it is
     *         absent, the index at which it would have to be inserted.
     */
    protected static int lookup(CharacterReference[] array, char ref, int lo, int hi) {
        while (lo <= hi) {
            // lower middle of the window
            final int mid = lo + ((hi - lo) / 2);
            final int difference = ref - array[mid].getCharacter();
            if (difference == 0) {
                return mid;
            }
            if (difference < 0) {
                hi = mid - 1;
            } else {
                lo = mid + 1;
            }
        }

        // not present: lo is the insertion point
        return lo;
    }

    /**
     * Look up a reference by character. Characters below
     * <code>BREAKPOINT</code> are resolved by indexing
     * <code>mCharacterList</code> directly; all others are located with a
     * binary search over the sorted remainder of that list.
     *
     * @param character The character to be looked up.
     * @return The entity reference for that character or <code>null</code>.
     */
    public static CharacterReference lookup(char character) {
        if (character < BREAKPOINT) {
            // direct table; empty slots hold null
            return mCharacterList[character];
        }

        final int slot = lookup(mCharacterList, character, BREAKPOINT, mCharacterList.length - 1);
        if (slot >= mCharacterList.length) {
            // insertion point past the end: nothing to compare against
            return null;
        }

        final CharacterReference found = mCharacterList[slot];
        return (character == found.getCharacter()) ? found : null;
    }

    /**
     * Look up a reference by kernel. Use a binary search on the ordered list of
     * known references. Since the binary search returns the position at which a
     * new item should be inserted, we check the references earlier in the list
     * if there is a failure.
     *
     * @param key A character reference with the kernel set to the string to be
     *            found. It need not be truncated at the exact end of the
     *            reference.
     * @return A known reference whose kernel is a prefix of the key's kernel,
     *         or <code>null</code> if there is none (including when the key's
     *         kernel is empty).
     */
    protected static CharacterReference lookup(CharacterReference key) {
        String string;
        int index;
        String kernel;
        char character;
        CharacterReference test;
        CharacterReference ret;

        // Care should be taken here because some entity references are
        // prefixes of others, i.e.:
        // \u2209[notin] \u00ac[not]
        // \u00ba[ordm] \u2228[or]
        // \u03d6[piv] \u03c0[pi]
        // \u00b3[sup3] \u2283[sup]
        ret = null;
        index = Sort.bsearch(mCharacterReferences, key);
        string = key.getKernel();
        if (index < mCharacterReferences.length) {
            ret = mCharacterReferences[index];
            kernel = ret.getKernel();
            if (!string.regionMatches(0, kernel, 0, kernel.length())) {
                // not exact, check references starting with same character
                // to see if a subset matches
                ret = null;
            }
        }
        // An empty probe has no first character to compare; skipping the scan
        // returns null instead of letting charAt(0) throw.
        if ((null == ret) && (0 != string.length())) {
            character = string.charAt(0);
            // walk backwards through the entries sharing the first character
            while (--index >= 0) {
                test = mCharacterReferences[index];
                kernel = test.getKernel();
                if (character != kernel.charAt(0)) {
                    break;
                }
                if (string.regionMatches(0, kernel, 0, kernel.length())) {
                    ret = test;
                    break;
                }
            }
        }

        return (ret);
    }

    /**
     * Look up a reference by kernel, given as a region of a larger string.
     * A fresh probe object is built on every call and handed to
     * {@link org.htmlparser.util.Translate#lookup(org.htmlparser.util.CharacterReference) lookup(CharacterReference)}.
     * <em>This is not very efficient, use that method directly instead.</em>
     *
     * @param kernel The string to lookup, i.e. "amp".
     * @param start  The starting point in the string of the kernel.
     * @param end    The ending point in the string of the kernel. This should be
     *               the index of the semicolon if it exists, or failing that, at
     *               least an index past the last character of the kernel.
     * @return The reference that matches the given string, or <code>null</code>
     *         if it wasn't found.
     */
    public static CharacterReference lookup(String kernel, int start, int end) {
        final CharacterReferenceEx window = new CharacterReferenceEx();

        window.setKernel(kernel);
        window.setStart(start);
        window.setEnd(end);
        return lookup(window);
    }

    /**
     * Decode a string containing references. Change all numeric character
     * reference and character entity references to unicode characters.
     * <p>
     * Numeric references may be decimal (<code>&amp;#nnn;</code>) or
     * hexadecimal (<code>&amp;#xhhh;</code>); the trailing semicolon is
     * optional. Values above U+FFFF are emitted as a UTF-16 surrogate pair.
     * A numeric reference whose value is zero or greater than U+10FFFF, and
     * any entity name that is not recognised, is copied to the output
     * unchanged.
     *
     * @param string The string to translate.
     * @return The decoded string; the argument itself is returned when it
     *         contains no ampersand.
     */
    public static String decode(String string) {
        // largest value that names a Unicode code point
        final int maxCodePoint = 0x10FFFF;
        CharacterReferenceEx key;
        int amp;
        int index;
        int length;
        StringBuffer buffer;
        char character;
        int number;
        int radix;
        int digit;
        int i;
        int semi;
        boolean done;
        CharacterReference item;
        String ret;

        if (-1 == (amp = string.indexOf('&'))) {
            ret = string;
        } else {
            key = null;
            index = 0;
            length = string.length();
            buffer = new StringBuffer(length);
            do {
                // equivalent to buffer.append (string.substring (index, amp));
                // but without the allocation of a new String
                while (index < amp) {
                    buffer.append(string.charAt(index++));
                }

                // step over the ampersand; amp still points at it so that it
                // can be copied verbatim below if nothing gets decoded
                index++;
                if (index < length) {
                    character = string.charAt(index);
                    if ('#' == character) {
                        // numeric character reference
                        index++;
                        number = 0;
                        radix = 0; // 0 = not yet known, else 10 or 16
                        i = index;
                        done = false;
                        while ((i < length) && !done) {
                            character = string.charAt(i);
                            digit = -1;
                            switch (character) {
                                case '0':
                                case '1':
                                case '2':
                                case '3':
                                case '4':
                                case '5':
                                case '6':
                                case '7':
                                case '8':
                                case '9':
                                    if (0 == radix) {
                                        radix = 10;
                                    }
                                    digit = character - '0';
                                    break;
                                case 'A':
                                case 'B':
                                case 'C':
                                case 'D':
                                case 'E':
                                case 'F':
                                    if (16 == radix) {
                                        digit = character - 'A' + 10;
                                    } else {
                                        done = true;
                                    }
                                    break;
                                case 'a':
                                case 'b':
                                case 'c':
                                case 'd':
                                case 'e':
                                case 'f':
                                    if (16 == radix) {
                                        digit = character - 'a' + 10;
                                    } else {
                                        done = true;
                                    }
                                    break;
                                case 'x':
                                case 'X':
                                    if (0 == radix) {
                                        radix = 16;
                                    } else {
                                        done = true;
                                    }
                                    break;
                                case ';':
                                    // the semicolon belongs to the reference
                                    done = true;
                                    i++;
                                    break;
                                default:
                                    done = true;
                                    break;
                            }
                            // Stop accumulating once the value has left the
                            // code point range; this keeps the int from
                            // overflowing on absurdly long digit runs.
                            if ((0 <= digit) && (number <= maxCodePoint)) {
                                number = number * radix + digit;
                            }
                            if (!done) {
                                i++;
                            }
                        }
                        if ((0 != number) && (number <= maxCodePoint)) {
                            if (number > 0xFFFF) {
                                // supplementary character: a single char
                                // cannot hold it, emit high + low surrogate
                                number -= 0x10000;
                                buffer.append((char) (0xD800 + (number >> 10)));
                                buffer.append((char) (0xDC00 + (number & 0x3FF)));
                            } else {
                                buffer.append((char) number);
                            }
                            index = i;
                            amp = index;
                        }

                    } else if (Character.isLetter(character)) {
                        // character entity reference
                        // (really can't start with a digit eh...)
                        i = index + 1;
                        done = false;
                        semi = length;
                        while ((i < length) && !done) {
                            character = string.charAt(i);
                            if (';' == character) {
                                done = true;
                                semi = i;
                                i++;
                            } else if (Character.isLetterOrDigit(character)) {
                                i++;
                            } else {
                                done = true;
                                semi = i;
                            }
                        }
                        // the probe is a window onto the input, reused for
                        // every entity, instead of
                        // new CharacterReference (string.substring (index,
                        // semi), 0);
                        if (null == key) {
                            key = new CharacterReferenceEx();
                        }
                        key.setKernel(string);
                        key.setStart(index);
                        key.setEnd(semi);
                        item = lookup(key);
                        if (null != item) {
                            buffer.append((char) item.getCharacter());
                            // consume only the matched kernel, which may be
                            // shorter than the run of letters scanned above
                            index += item.getKernel().length();
                            if ((index < length) && (';' == string.charAt(index))) {
                                index++;
                            }
                            amp = index;
                        }
                    }
                    // anything else: nothing to do here, the ampersand will
                    // be consumed below
                }
                // gather up unconsumed characters
                while (amp < index) {
                    buffer.append(string.charAt(amp++));
                }
            } while ((index < length) && (-1 != (amp = string.indexOf('&', index))));
            // equivalent to buffer.append (string.substring (index));
            // but without the allocation of a new String
            while (index < length) {
                buffer.append(string.charAt(index++));
            }
            ret = buffer.toString();
        }

        return (ret);
    }

    /**
     * Decode the contents of a string buffer containing references. The
     * buffer is converted to a string and passed to the string decoder, which
     * changes numeric character references and character entity references
     * to unicode characters. The buffer itself is not modified.
     *
     * @param buffer The StringBuffer containing references.
     * @return The decoded string.
     */
    public static String decode(StringBuffer buffer) {
        final String text = buffer.toString();

        return decode(text);
    }

    /**
     * Decode a stream containing references. Change all numeric character
     * reference and character entity references to unicode characters. If
     * <code>DECODE_LINE_BY_LINE</code> is true, the input is split into
     * alternating runs of ordinary characters and line terminators (carriage
     * return or newline); each ordinary run is decoded and written as soon as
     * it ends, and terminator runs are written through untouched. Otherwise
     * the whole input is buffered and decoded in one go. An
     * <code>IOException</code> while reading is reported by printing its
     * message to <code>out</code>; <code>out</code> is always flushed.
     *
     * @param in  The stream to translate. It is assumed that the input stream
     *            is encoded with ISO-8859-1 since the table of character entity
     *            references in this class applies only to ISO-8859-1.
     * @param out The stream to write the decoded stream to.
     */
    public static void decode(InputStream in, PrintStream out) {
        try {
            Reader source;
            try {
                source = new BufferedReader(new InputStreamReader(in, "ISO-8859-1"));
            } catch (UnsupportedEncodingException unsupported) {
                // yeah, like this will happen; OK, assume the default is
                // ISO-8859-1
                source = new BufferedReader(new InputStreamReader(in));
            }

            final StringBuffer pending = new StringBuffer(1024);
            // true while 'pending' holds a run of line terminators
            boolean inTerminators = false;
            int next;

            if (DECODE_LINE_BY_LINE) {
                while (-1 != (next = source.read())) {
                    final boolean terminator = ('\r' == next) || ('\n' == next);
                    if (terminator != inTerminators) {
                        // run boundary: emit what was collected so far,
                        // decoding it only if it was ordinary text
                        if (inTerminators) {
                            out.print(pending.toString());
                        } else {
                            out.print(decode(pending.toString()));
                        }
                        pending.setLength(0);
                        inTerminators = terminator;
                    }
                    pending.append((char) next);
                }
            } else {
                while (-1 != (next = source.read())) {
                    pending.append((char) next);
                }
            }

            // emit the final run, if any
            if (0 != pending.length()) {
                if (inTerminators) {
                    out.print(pending.toString());
                } else {
                    out.print(decode(pending.toString()));
                }
            }
        } catch (IOException ioe) {
            out.println();
            out.println(ioe.getMessage());
        } finally {
            out.flush();
        }
    }

    /**
     * Convert a character to a numeric character reference. The result has
     * the form &amp;#nnnn; in decimal, or &amp;#xhhhh; in hexadecimal when
     * <code>ENCODE_HEXADECIMAL</code> is set.
     *
     * @param character The character to convert.
     * @return The converted character.
     */
    public static String encode(int character) {
        final String digits;

        if (ENCODE_HEXADECIMAL) {
            digits = "x" + Integer.toHexString(character);
        } else {
            digits = Integer.toString(character);
        }

        return "&#" + digits + ';';
    }

    /**
     * Encode a string to use references. A character for which a character
     * entity reference is known is replaced by that reference
     * (<code>&amp;name;</code>). Any other character with a value of 0x7F or
     * above is replaced by a numeric character reference, hexadecimal when
     * <code>ENCODE_HEXADECIMAL</code> is set and decimal otherwise. All
     * remaining characters are copied unchanged.
     *
     * @param string The string to translate.
     * @return The encoded string.
     */
    public static String encode(String string) {
        final StringBuffer encoded = new StringBuffer(string.length() * 6);
        final int count = string.length();

        for (int position = 0; position < count; position++) {
            final char ch = string.charAt(position);
            final CharacterReference named = lookup(ch);

            if (named != null) {
                // a named entity exists for this character
                encoded.append('&').append(named.getKernel()).append(';');
            } else if (ch < 0x007F) {
                // plain character, passes through as is
                encoded.append(ch);
            } else {
                encoded.append("&#");
                if (ENCODE_HEXADECIMAL) {
                    encoded.append("x").append(Integer.toHexString(ch));
                } else {
                    encoded.append((int) ch);
                }
                encoded.append(';');
            }
        }

        return encoded.toString();
    }

    /**
     * Encode a stream to use references. Each character read is written as
     * its character entity reference when one is known, as a numeric
     * character reference when it has no entity and its value is 0x7F or
     * above, and unchanged otherwise. An <code>IOException</code> while
     * reading is reported by printing its message to the output; the output
     * is always flushed.
     *
     * @param in  The stream to translate. It is assumed that the input stream
     *            is encoded with ISO-8859-1 since the table of character entity
     *            references in this class applies only to ISO-8859-1.
     * @param out The stream to write the encoded stream to. It is written
     *            using ISO-8859-1 as well.
     */
    public static void encode(InputStream in, PrintStream out) {
        Reader source;
        PrintWriter sink;

        try {
            source = new BufferedReader(new InputStreamReader(in, "ISO-8859-1"));
            sink = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "ISO-8859-1")));
        } catch (UnsupportedEncodingException unsupported) {
            // yeah, like this will happen; OK, assume default is ISO-8859-1
            source = new BufferedReader(new InputStreamReader(in));
            sink = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out)));
        }

        try {
            int read;
            while (-1 != (read = source.read())) {
                final char ch = (char) read;
                final CharacterReference named = lookup(ch);

                if (null != named) {
                    // a named entity exists for this character
                    sink.print('&');
                    sink.print(named.getKernel());
                    sink.print(';');
                    continue;
                }
                if (ch < 0x007F) {
                    // plain character, passes through as is
                    sink.print(ch);
                    continue;
                }
                sink.print("&#");
                if (ENCODE_HEXADECIMAL) {
                    sink.print("x");
                    sink.print(Integer.toHexString(ch));
                } else {
                    sink.print((int) ch);
                }
                sink.print(';');
            }
        } catch (IOException ioe) {
            sink.println();
            sink.println(ioe.getMessage());
        } finally {
            sink.flush();
        }
    }

    /**
     * Numeric character reference and character entity reference to unicode
     * codec. Reads <code>System.in</code>, encodes or decodes it, and writes
     * the result to <code>System.out</code>.
     *
     * @param args If arg[0] is <code>-encode</code> (compared without regard
     *             to case) perform an encoding on <code>System.in</code>,
     *             otherwise perform a decoding.
     */
    public static void main(String[] args) {
        final boolean encoding = (args.length > 0) && args[0].equalsIgnoreCase("-encode");

        if (encoding) {
            encode(System.in, System.out);
        } else {
            decode(System.in, System.out);
        }
    }
}
